## This file generates the simulation populations and samples from them

## Simulation/DGP parameters --------------------
# Number of variables
numVar <- c(5, 10, 20)
# Population size = 100k; note that an earlier draft of this paper erroneously
# stated that the population size was 1mil
popN <- 1e5
# Sample sizes
sampleSize <- c(25, 100, 250, 500, 750, 1000)
# Number of simulations
numSims <- 100
# Outcome probabilities, stored as c(P(y=1 | x \in A), P(y=1 | x \notin A)):
#   deterministic: P(y=1 | x \in A)=1 and P(y=1 | x \notin A)=0; i.e., y=1 if
#                  and only if x satisfies the rule set
#   probabilistic: P(y=1 | x \in A)=.75 and P(y=1 | x \notin A)=.25
pppn <- list(
  deterministic = c(1, 0),
  probabilistic = c(0.75, 0.25)
)


## Generate data --------------------
# Builds `allData`: one element per (number of variables, outcome-probability
# setting) combination, in the order numVar (outer) x pppn (inner). Each
# element is a list of `numSims` simulated populations. Elements are named by
# the parameters used to generate them; format: J_pp_pn.
#
# genData() and getY() are defined outside this file; presumably genData()
# returns a population of `popN` rows with one column per variable name and
# getY() draws the outcome from the rule set -- TODO confirm against their
# definitions.
#
# The order of the genData()/getY() calls must not change: it determines how
# the random number stream started by set.seed(123) is consumed.

# True rule set; it does not depend on the loop variables, so define it once
Astar <- list(c("a_1", "b_1"), c("c_1", "d_1", "e_1_neg"))

numDesigns <- length(numVar) * length(pppn)
allData <- vector("list", numDesigns)
params <- character(numDesigns) # Parameters used to generate data; format: J_pp_pn
index <- 1
set.seed(123)
for (j in seq_along(numVar)) {
  J <- numVar[j]
  # Names of variables: a_1, b_1, ... (called varNames so that base::names()
  # is not masked)
  varNames <- paste(letters[seq_len(J)], "1", sep = "_")
  for (k in seq_along(pppn)) {
    pp <- pppn[[k]][1] # P(y=1 | x \in A)
    pn <- pppn[[k]][2] # P(y=1 | x \notin A)
    params[index] <- paste(J, pp, pn, sep = "_")
    # Generate the numSims populations for this parameter combination
    data <- vector("list", numSims)
    for (i in seq_len(numSims)) {
      df <- genData(varNames, popN)
      Y <- getY(df, Astar, p_pos = pp, p_neg = pn)
      # Keep the bare symbol `Y` here so the appended column is named "Y"
      data[[i]] <- cbind(df, Y)
    }
    # Store the populations for this parameter combination
    allData[[index]] <- data
    index <- index + 1
  }
}
names(allData) <- params


## Sample from the populations and store the respective indices --------------------
# allIndices[[i]][[j]] holds the row indices of the sample of size
# sampleSize[i] for simulation j, drawn without replacement from 1..popN.
# The draws happen in the order sample size (outer) x simulation (inner),
# which fixes how the random number stream from set.seed(123) is consumed.
set.seed(123)
allIndices <- lapply(sampleSize, function(n) {
  # One independent draw of n indices per simulation
  lapply(seq_len(numSims), function(sim) sample(seq_len(popN), n))
})


## Save data --------------------
# save() does not create missing directories, so make sure the output
# directory exists first; otherwise the generated data would be lost after the
# whole run. Paths are relative to the current working directory.
dir.create("sim/out", recursive = TRUE, showWarnings = FALSE)
save(allData, file = "sim/out/simData.rda")
save(allIndices, file = "sim/out/simIndices.rda")